Regression Analysis for Water Temperature Data

The goal of this analysis is to develop a regression to predict water temperature when data are missing or the time series is incomplete.

Currently this analysis is performed for Feather River and Yuba River. The resulting dataset with predicted values are saved and integrated in the development of a water temperature dataset.

Data used to build models: Butte Creek

Butte Creek is used to build the regression models because the time series is complete and the data are high quality.

Feather River

Data Preparation

  1. Pull in gage data from (GRL will represent the High Flow Channel (HFC) and FRA will represent the Low Flow Channel (LFC):
  • GRL (2003-03-05 to 2007-06-01 H; 2020-01-04 to present E): located after Thermalito Afterbay
  • FRA (2002-01-01 to present): located between Lake Oroville and Thermalito Afterbay
  1. Prepare datasets for regression analysis (dataset with no missing data to train and dataset with missing data to predict)

LFC

Plot of mean temp for Feather River LFC and Butte Creek

## `geom_smooth()` using formula = 'y ~ x'

Plot of min temp for Feather River LFC and Butte Creek

## `geom_smooth()` using formula = 'y ~ x'

Plot of max temp for Feather River LFC and Butte Creek

## `geom_smooth()` using formula = 'y ~ x'

Building Mean Regression

# LFC regression and predictions

# MEAN Regression
split <-rsample::initial_split(feather_lfc_regression_data_mean, prop = 0.8)
train <- rsample::training(split)
test <- rsample::testing(split)
feather_lfc_mod_mean <- lm(temp ~ date + butte_temp, data = train)
summary(feather_lfc_mod_mean)
## 
## Call:
## lm(formula = temp ~ date + butte_temp, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.8490 -0.8254  0.0363  0.8137  6.0039 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -3.271e+00  1.117e+00  -2.930  0.00344 ** 
## date         5.582e-04  5.938e-05   9.399  < 2e-16 ***
## butte_temp   4.184e-01  6.002e-03  69.708  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.336 on 1565 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.7612, Adjusted R-squared:  0.7609 
## F-statistic:  2495 on 2 and 1565 DF,  p-value: < 2.2e-16
test_predict <- predict(feather_lfc_mod_mean, test)
test_predict_df <- test |>
  mutate(predicted = test_predict)
# evaluate predictions - MAPE of 10% is not bad
mean(abs((
  test_predict_df$predicted - test_predict_df$temp
)) / test_predict_df$temp)
## [1] 0.0857866

Building Min Regression

# MIN Regression
split <-rsample::initial_split(feather_lfc_regression_data_min, prop = 0.8)
train <- rsample::training(split)
test <- rsample::testing(split)
feather_lfc_mod_min <- lm(temp ~ date + butte_temp, data = train)
summary(feather_lfc_mod_min)
## 
## Call:
## lm(formula = temp ~ date + butte_temp, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.6976 -0.8362  0.0226  0.7779  6.1884 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -2.624e+00  1.123e+00  -2.337   0.0196 *  
## date         5.267e-04  5.978e-05   8.811   <2e-16 ***
## butte_temp   3.921e-01  6.698e-03  58.539   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.346 on 1565 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.6944, Adjusted R-squared:  0.694 
## F-statistic:  1778 on 2 and 1565 DF,  p-value: < 2.2e-16
test_predict <- predict(feather_lfc_mod_min, test)
test_predict_df <- test |>
  mutate(predicted = test_predict)
# evaluate predictions - MAPE of 10% is not bad
mean(abs((
  test_predict_df$predicted - test_predict_df$temp
)) / test_predict_df$temp)
## [1] 0.08469061

Building Max Regression

# MAX Regression
split <-rsample::initial_split(feather_lfc_regression_data_max, prop = 0.8)
train <- rsample::training(split)
test <- rsample::testing(split)
feather_lfc_mod_max <- lm(temp ~ date + butte_temp, data = train)
summary(feather_lfc_mod_max)
## 
## Call:
## lm(formula = temp ~ date + butte_temp, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.3015 -0.8102  0.0117  0.8065  6.1645 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -3.732e+00  1.214e+00  -3.075  0.00214 ** 
## date         5.820e-04  6.453e-05   9.019  < 2e-16 ***
## butte_temp   4.356e-01  5.923e-03  73.546  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.449 on 1565 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.779,  Adjusted R-squared:  0.7787 
## F-statistic:  2758 on 2 and 1565 DF,  p-value: < 2.2e-16
test_predict <- predict(feather_lfc_mod_max, test)
test_predict_df <- test |>
  mutate(predicted = test_predict)
# evaluate predictions - MAPE of 10% is not bad
mean(abs((
  test_predict_df$predicted - test_predict_df$temp
)) / test_predict_df$temp)
## [1] 0.07865859

Predictions

# Predictions
feather_gap_predicted_mean_lfc <- predict(feather_lfc_mod_mean, feather_gap_mean)
feather_lfc_mean_predicted <- feather_gap_mean |> 
  mutate(value = feather_gap_predicted_mean_lfc,
         statistic = "mean") |> 
  select(date, value, statistic)
ggplot(feather_lfc_mean_predicted, aes(x = date, y = value)) +
  geom_line()

feather_gap_predicted_min_lfc <- predict(feather_lfc_mod_min, feather_gap_min)
feather_lfc_min_predicted <- feather_gap_min |> 
  mutate(value = feather_gap_predicted_min_lfc,
         statistic = "min") |> 
  select(date, value, statistic)
ggplot(feather_lfc_min_predicted, aes(x = date, y = value)) +
  geom_line()

feather_gap_predicted_max_lfc <- predict(feather_lfc_mod_max, feather_gap_max)
feather_lfc_max_predicted <- feather_gap_max |> 
  mutate(value = feather_gap_predicted_max_lfc,
         statistic = "max") |> 
  select(date, value, statistic)
ggplot(feather_lfc_max_predicted, aes(x = date, y = value)) +
  geom_line()

Full dataset

feather_gap_lfc <- bind_rows(feather_lfc_max_predicted,
                             feather_lfc_mean_predicted,
                             feather_lfc_min_predicted) |> 
  mutate(stream = "feather river",
         site_group = "upper feather lfc") %>% 
  pivot_wider(id_cols = c(date, stream, site_group), values_from = "value", names_from = "statistic") |>
  rename(mean_i = mean,
         max_i = max,
         min_i = min) |> 
  full_join(feather_lfc_format |> 
              pivot_wider(id_cols = c(stream, date, gage_agency, gage_number, site_group), values_from = "value", names_from = "statistic")) |> 
  mutate(gage_agency = ifelse(is.na(mean), "interpolated", gage_agency),
         gage_number = ifelse(is.na(mean), "interpolated", gage_number),
         mean = ifelse(is.na(mean), mean_i, mean),
         min = ifelse(is.na(min), min_i, min),
         max = ifelse(is.na(max), max_i, max)) |>
  select(-c(min_i, mean_i, max_i)) |> 
  pivot_longer(cols = mean:min, values_to = "value", names_to = "statistic") |> 
  group_by(stream, date, statistic, gage_agency, gage_number, site_group) |> glimpse()
## Joining with `by = join_by(date, stream, site_group)`
## Rows: 26,367
## Columns: 7
## Groups: stream, date, statistic, gage_agency, gage_number, site_group [26,367]
## $ date        <date> 1999-12-31, 1999-12-31, 1999-12-31, 2000-01-01, 2000-01-0…
## $ stream      <chr> "feather river", "feather river", "feather river", "feathe…
## $ site_group  <chr> "upper feather lfc", "upper feather lfc", "upper feather l…
## $ gage_agency <chr> "interpolated", "interpolated", "interpolated", "interpola…
## $ gage_number <chr> "interpolated", "interpolated", "interpolated", "interpola…
## $ statistic   <chr> "mean", "max", "min", "mean", "max", "min", "mean", "max",…
## $ value       <dbl> 4.852591, 4.735839, 5.028812, 4.922884, 5.172034, 4.833300…
ggplot() +
  geom_line(data = feather_gap_lfc, aes(x = date, y = value, color = statistic)) +
  theme_minimal()
## Warning: Removed 3 rows containing missing values (`geom_line()`).

HFC

Plot of mean temp for Feather River HFC and Butte Creek

# plot for mean
ggplot(data = feather_hfc_regression_data_mean, aes(x = butte_temp, y = temp)) +
  geom_point() +
  stat_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'

Plot of min temp for Feather River HFC and Butte Creek

# plot for mean
ggplot(data = feather_hfc_regression_data_min, aes(x = butte_temp, y = temp)) +
  geom_point() +
  stat_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'

Plot of max temp for Feather River HFC and Butte Creek

# plot for mean
ggplot(data = feather_hfc_regression_data_max, aes(x = butte_temp, y = temp)) +
  geom_point() +
  stat_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'

Building Mean Regression

# HFC regression and predictions

# MEAN Regression
split <-rsample::initial_split(feather_hfc_regression_data_mean, prop = 0.8)
train <- rsample::training(split)
test <- rsample::testing(split)
feather_hfc_mod_mean <- lm(temp ~ date + butte_temp, data = train)
summary(feather_hfc_mod_mean)
## 
## Call:
## lm(formula = temp ~ date + butte_temp, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -20.023  -1.358  -0.122   1.351   4.850 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 11.9592732  2.3046957   5.189 2.56e-07 ***
## date        -0.0004860  0.0001811  -2.684  0.00741 ** 
## butte_temp   0.6784402  0.0132344  51.263  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.117 on 1001 degrees of freedom
## Multiple R-squared:  0.7242, Adjusted R-squared:  0.7236 
## F-statistic:  1314 on 2 and 1001 DF,  p-value: < 2.2e-16
test_predict <- predict(feather_hfc_mod_mean, test)
test_predict_df <- test |>
  mutate(predicted = test_predict)
# evaluate predictions - MAPE of 10% is not bad
mean(abs((
  test_predict_df$predicted - test_predict_df$temp
)) / test_predict_df$temp)
## [1] NA

Building Min Regression

# MIN Regression
split <-rsample::initial_split(feather_hfc_regression_data_min, prop = 0.8)
train <- rsample::training(split)
test <- rsample::testing(split)
feather_hfc_mod_min <- lm(temp ~ date + butte_temp, data = train)
summary(feather_hfc_mod_min)
## 
## Call:
## lm(formula = temp ~ date + butte_temp, data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14.9250  -1.1006   0.0304   1.2519   4.3854 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 15.6190437  2.1195204   7.369 3.59e-13 ***
## date        -0.0007780  0.0001666  -4.670 3.42e-06 ***
## butte_temp   0.6759568  0.0134890  50.112  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.988 on 1000 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.7152, Adjusted R-squared:  0.7147 
## F-statistic:  1256 on 2 and 1000 DF,  p-value: < 2.2e-16
test_predict <- predict(feather_hfc_mod_min, test)
test_predict_df <- test |>
  mutate(predicted = test_predict)
# evaluate predictions - MAPE of 10% is not bad
mean(abs((
  test_predict_df$predicted - test_predict_df$temp
)) / test_predict_df$temp)
## [1] 0.1148937

Building Max Regression

# MAX Regression
split <-rsample::initial_split(feather_hfc_regression_data_max, prop = 0.8)
train <- rsample::training(split)
test <- rsample::testing(split)
feather_hfc_mod_max <- lm(temp ~ date + butte_temp, data = train)
summary(feather_hfc_mod_max)
## 
## Call:
## lm(formula = temp ~ date + butte_temp, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -19.049  -1.406  -0.159   1.845   5.769 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 17.0119891  2.9825730   5.704 1.54e-08 ***
## date        -0.0007998  0.0002344  -3.412 0.000671 ***
## butte_temp   0.6028812  0.0150607  40.030  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.708 on 1000 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.6158, Adjusted R-squared:  0.615 
## F-statistic: 801.2 on 2 and 1000 DF,  p-value: < 2.2e-16
test_predict <- predict(feather_hfc_mod_max, test)
test_predict_df <- test |>
  mutate(predicted = test_predict)
# evaluate predictions - MAPE of 10% is not bad
mean(abs((
  test_predict_df$predicted - test_predict_df$temp
)) / test_predict_df$temp)
## [1] 0.1154672

Predictions

# Predictions
feather_gap_predicted_mean_hfc <- predict(feather_hfc_mod_mean, feather_gap_mean)
feather_hfc_mean_predicted <- feather_gap_mean |> 
  mutate(value = feather_gap_predicted_mean_hfc,
         statistic = "mean") |> 
  select(date, value, statistic)
ggplot(feather_hfc_mean_predicted, aes(x = date, y = value)) +
  geom_line()

feather_gap_predicted_min_hfc <- predict(feather_hfc_mod_min, feather_gap_min)
feather_hfc_min_predicted <- feather_gap_min |> 
  mutate(value = feather_gap_predicted_min_hfc,
         statistic = "min") |> 
  select(date, value, statistic)
ggplot(feather_hfc_min_predicted, aes(x = date, y = value)) +
  geom_line()

feather_gap_predicted_max_hfc <- predict(feather_hfc_mod_max, feather_gap_max)
feather_hfc_max_predicted <- feather_gap_max |> 
  mutate(value = feather_gap_predicted_max_hfc,
         statistic = "max") |> 
  select(date, value, statistic)
ggplot(feather_hfc_max_predicted, aes(x = date, y = value)) +
  geom_line()

Full dataset

feather_gap_hfc <- bind_rows(feather_hfc_max_predicted,
                             feather_hfc_mean_predicted,
                             feather_hfc_min_predicted) |> 
  mutate(stream = "feather river",
         site_group = "upper feather hfc") %>% 
  pivot_wider(id_cols = c(date, stream, site_group), values_from = "value", names_from = "statistic") |>
  rename(mean_i = mean,
         max_i = max,
         min_i = min) |> 
  full_join(feather_hfc_format |> 
              pivot_wider(id_cols = c(stream, date, gage_agency, gage_number, site_group), values_from = "value", names_from = "statistic")) |> 
  mutate(gage_agency = ifelse(is.na(mean), "interpolated", gage_agency),
         gage_number = ifelse(is.na(mean), "interpolated", gage_number),
         mean = ifelse(is.na(mean), mean_i, mean),
         min = ifelse(is.na(min), min_i, min),
         max = ifelse(is.na(max), max_i, max)) |> 
  select(-c(min_i, mean_i, max_i)) |> 
  pivot_longer(cols = mean:min, values_to = "value", names_to = "statistic") |> 
  group_by(stream, date, statistic, gage_agency, gage_number, site_group) |> glimpse()
## Joining with `by = join_by(date, stream, site_group)`
## Rows: 26,403
## Columns: 7
## Groups: stream, date, statistic, gage_agency, gage_number, site_group [26,403]
## $ date        <date> 1999-12-31, 1999-12-31, 1999-12-31, 2000-01-01, 2000-01-0…
## $ stream      <chr> "feather river", "feather river", "feather river", "feathe…
## $ site_group  <chr> "upper feather hfc", "upper feather hfc", "upper feather h…
## $ gage_agency <chr> "interpolated", "interpolated", "interpolated", "interpola…
## $ gage_number <chr> "interpolated", "interpolated", "interpolated", "interpola…
## $ statistic   <chr> "mean", "max", "min", "mean", "max", "min", "mean", "max",…
## $ value       <dbl> 9.890990, 11.143629, 10.339793, 10.003578, 11.745711, 10.0…
ggplot() +
  geom_line(data = feather_gap_hfc, aes(x = date, y = value, color = statistic)) +
  theme_minimal()
## Warning: Removed 3 rows containing missing values (`geom_line()`).

Yuba River

Data Prepartion

  1. Pull in gage data from YR7 CDEC gage, however, this only contains data from 2020 onwards. Temperature data collected along with RST data was originally used to fill the gap prior to 2020; however, due to inconsistencies in these data sources the resulting predicted mean values were lower than the min values as the RST data only has mean data.

  2. Prepare datasets for regression analysis (dataset with no missing data to train and dataset with missing data to predict)

Plot of mean temp for Yuba River and Butte Creek

# plot for mean
ggplot(data = yuba_regression_data_mean, aes(x = butte_temp, y = temp)) +
  geom_point() +
  stat_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'

Plot of min temp for Yuba River and Butte Creek

# plot for mean
ggplot(data = yuba_regression_data_min, aes(x = butte_temp, y = temp)) +
  geom_point() +
  stat_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'

Plot of max temp for Yuba River and Butte Creek

# plot for mean
ggplot(data = yuba_regression_data_max, aes(x = butte_temp, y = temp)) +
  geom_point() +
  stat_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'

Building Mean Regression

# MEAN Regression
split <-rsample::initial_split(yuba_regression_data_mean, prop = 0.8)
train <- rsample::training(split)
test <- rsample::testing(split)
yuba_mod_mean <- lm(temp ~ date + butte_temp, data = train)
summary(yuba_mod_mean)
## 
## Call:
## lm(formula = temp ~ date + butte_temp, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.9019 -1.0516  0.0279  1.0812  3.1416 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.092e+01  1.593e+00  13.129   <2e-16 ***
## date        -7.705e-04  8.378e-05  -9.197   <2e-16 ***
## butte_temp   5.884e-01  6.879e-03  85.543   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.423 on 1297 degrees of freedom
## Multiple R-squared:  0.852,  Adjusted R-squared:  0.8517 
## F-statistic:  3732 on 2 and 1297 DF,  p-value: < 2.2e-16
test_predict <- predict(yuba_mod_mean, test)
test_predict_df <- test |>
  mutate(predicted = test_predict)
# evaluate predictions - MAPE of 10% is not bad
mean(abs((
  test_predict_df$predicted - test_predict_df$temp
)) / test_predict_df$temp)
## [1] 0.08459931

Building Min Regression

# MIN Regression
split <-rsample::initial_split(yuba_regression_data_min, prop = 0.8)
train <- rsample::training(split)
test <- rsample::testing(split)
yuba_mod_min <- lm(temp ~ date + butte_temp, data = train)
summary(yuba_mod_min)
## 
## Call:
## lm(formula = temp ~ date + butte_temp, data = train)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -3.648 -1.032 -0.009  1.053  3.257 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.291e+01  1.601e+00   14.31   <2e-16 ***
## date        -8.818e-04  8.412e-05  -10.48   <2e-16 ***
## butte_temp   5.401e-01  7.560e-03   71.44   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.419 on 1297 degrees of freedom
## Multiple R-squared:  0.8027, Adjusted R-squared:  0.8024 
## F-statistic:  2638 on 2 and 1297 DF,  p-value: < 2.2e-16
test_predict <- predict(yuba_mod_min, test)
test_predict_df <- test |>
  mutate(predicted = test_predict)
# evaluate predictions - MAPE of 10% is not bad
mean(abs((
  test_predict_df$predicted - test_predict_df$temp
)) / test_predict_df$temp)
## [1] 0.09698594

Building Max Regression

# MAX Regression
split <-rsample::initial_split(yuba_regression_data_max, prop = 0.8)
train <- rsample::training(split)
test <- rsample::testing(split)
yuba_mod_max <- lm(temp ~ date + butte_temp, data = train)
summary(yuba_mod_max)
## 
## Call:
## lm(formula = temp ~ date + butte_temp, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.6824 -0.8997  0.0082  0.9982  6.2881 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.889e+01  1.613e+00  11.713  < 2e-16 ***
## date        -6.650e-04  8.466e-05  -7.855 8.34e-15 ***
## butte_temp   6.400e-01  6.234e-03 102.669  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.418 on 1297 degrees of freedom
## Multiple R-squared:  0.8924, Adjusted R-squared:  0.8922 
## F-statistic:  5377 on 2 and 1297 DF,  p-value: < 2.2e-16
test_predict <- predict(yuba_mod_max, test)
test_predict_df <- test |>
  mutate(predicted = test_predict)
# evaluate predictions - MAPE of 10% is not bad
mean(abs((
  test_predict_df$predicted - test_predict_df$temp
)) / test_predict_df$temp)
## [1] 0.07724969

Predictions

# Predictions
yuba_gap_predicted_mean <- predict(yuba_mod_mean, yuba_gap_mean)
yuba_mean_predicted <- yuba_gap_mean |> 
  mutate(value = yuba_gap_predicted_mean,
         statistic = "mean_predicted") |> 
  select(date, value, statistic)
ggplot(yuba_mean_predicted, aes(x = date, y = value)) +
  geom_line()
## Warning: Removed 1 row containing missing values (`geom_line()`).

yuba_gap_predicted_min <- predict(yuba_mod_min, yuba_gap_min)
yuba_min_predicted <- yuba_gap_min |> 
  mutate(value = yuba_gap_predicted_min,
         statistic = "min_predicted") |> 
  select(date, value, statistic)
ggplot(yuba_min_predicted, aes(x = date, y = value)) +
  geom_line()
## Warning: Removed 1 row containing missing values (`geom_line()`).

yuba_gap_predicted_max <- predict(yuba_mod_max, yuba_gap_max)
yuba_max_predicted <- yuba_gap_max |> 
  mutate(value = yuba_gap_predicted_max,
         statistic = "max_predicted") |> 
  select(date, value, statistic)
ggplot(yuba_max_predicted, aes(x = date, y = value)) +
  geom_line()
## Warning: Removed 1 row containing missing values (`geom_line()`).

Full dataset

yuba_gap <- bind_rows(yuba_max_predicted,
                      yuba_mean_predicted,
                      yuba_min_predicted) |> 
  mutate(stream = "yuba river") |> 
  pivot_wider(id_cols = c(date, stream), values_from = "value", names_from = "statistic") |> 
  full_join(yuba_format |> 
              pivot_wider(id_cols = c(stream, date, gage_agency, gage_number), values_from = "value", names_from = "statistic")) |> 
  mutate(gage_agency = ifelse(is.na(mean), "interpolated", gage_agency),
         gage_number = ifelse(is.na(mean), "interpolated", gage_number),
         mean = ifelse(is.na(mean), mean_predicted, mean),
         min = ifelse(is.na(min), min_predicted, min),
         max = ifelse(is.na(max), max_predicted, max)) |> 
  select(-c(mean_predicted, max_predicted, min_predicted)) |> 
    pivot_longer(cols = mean:min, values_to = "value", names_to = "statistic") |>
  group_by(stream, date, statistic, gage_agency, gage_number) |> glimpse()
## Joining with `by = join_by(date, stream)`
## Rows: 26,367
## Columns: 6
## Groups: stream, date, statistic, gage_agency, gage_number [26,367]
## $ date        <date> 1999-12-31, 1999-12-31, 1999-12-31, 2000-01-01, 2000-01-0…
## $ stream      <chr> "yuba river", "yuba river", "yuba river", "yuba river", "y…
## $ gage_agency <chr> "interpolated", "interpolated", "interpolated", "interpola…
## $ gage_number <chr> "interpolated", "interpolated", "interpolated", "interpola…
## $ statistic   <chr> "mean", "max", "min", "mean", "max", "min", "mean", "max",…
## $ value       <dbl> 15.30452, 14.67537, 15.83686, 15.40182, 15.31474, 15.56593…
ggplot() +
  geom_line(data = yuba_gap, aes(x = date, y = value, color = statistic)) +
  theme_minimal()
## Warning: Removed 3 rows containing missing values (`geom_line()`).